package com.company.reptile;

import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

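/* Description: web crawler exercise (originally labelled "multi-threaded";
 * this class fetches its pages sequentially).
 * Originally described as crawling the first 100 pages of a forum; the code
 * below requests pages 1-10 of a single page URL.
 * @author szq
 * @time 2022/01/05 */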
public class ReptileFoundation {

    private static final Logger log = Logger.getLogger(ReptileFoundation.class);

    /** Site root; the page file name is appended to this in {@link #main}. */
    private static final String ROOT_PATH = "http://fundf10.eastmoney.com/";

    /** Code embedded in the page file name ({@code jjjz_<code>.html}). */
    private static final String FUND_CODE = "008279";

    /** First and last page numbers requested by {@link #main}. */
    private static final int FIRST_PAGE = 1;
    private static final int LAST_PAGE = 10;

    /** Script handed to PhantomJS; it receives the URL and the page number as arguments. */
    private static final String SCRIPT_PATH =
            "D:\\java_web\\JavaCourse\\src\\com\\company\\reptile\\myjs.js";

    /** Location of the PhantomJS executable. */
    private static final String PHANTOMJS_PATH = "D:\\java_web\\JavaCourse\\src\\com\\company" +
            "\\reptile\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";

    // Regex quantifier reminder: '?' = 0 or 1, '*' = 0 or more, '+' = 1 or more;
    // a trailing '?' (".*?", ".+?") makes the quantifier lazy (shortest match).
    // Compiled once here instead of once per page/row/cell.

    /** A table row whose class attribute is empty. */
    private static final Pattern ROW_PATTERN = Pattern.compile("<tr class=\"\".+?/tr>");

    /** One table cell inside a row. */
    private static final Pattern CELL_PATTERN = Pattern.compile("<td.*?/td>");

    /** Captures the content between a cell's opening and closing tags in group 1. */
    private static final Pattern CELL_TEXT_PATTERN = Pattern.compile("<td.*?>(.*?)</td>");

    /**
     * Requests pages {@code FIRST_PAGE}..{@code LAST_PAGE} of the page URL, one after
     * another, and prints the table cells found on each.
     *
     * @param args unused
     * @throws InterruptedException never thrown by the current body; kept so the
     *                              signature stays compatible with existing callers
     */
    public static void main(String[] args) throws InterruptedException {
        // ROOT_PATH holds only the site root, so the file name is appended exactly once.
        // (Previously the full page URL had the file name appended a second time.)
        String urlPath = ROOT_PATH + "jjjz_" + FUND_CODE + ".html";

        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            AccessHtml(urlPath, String.valueOf(page));
        }
    }

    /**
     * Runs PhantomJS with the configured script, passing {@code url} and {@code pageNum}
     * as arguments, and returns everything the process writes to standard output.
     * Line terminators are dropped, so the result is a single line.
     *
     * @param url     page address passed to the script
     * @param pageNum page number passed to the script
     * @return the concatenated standard output of the process; the empty string if the
     *         process cannot be started or its output cannot be read
     */
    public static String getHtml(String url, String pageNum) {
        log.debug("script: " + SCRIPT_PATH);
        log.debug("phantomjs: " + PHANTOMJS_PATH);

        // Argument-list form: each value reaches the process as one argument, instead of
        // being re-split on whitespace as Runtime.exec(String) does.
        ProcessBuilder builder = new ProcessBuilder(PHANTOMJS_PATH, SCRIPT_PATH, url, pageNum);
        // Send the child's stderr to this JVM's stderr so an unread pipe can never fill
        // up and stall the child, while keeping stdout limited to the script's output.
        builder.redirectError(ProcessBuilder.Redirect.INHERIT);

        String htmlStr = "";
        Process process = null;
        try {
            process = builder.start();

            StringBuilder collected = new StringBuilder();
            // NOTE(review): the reader uses the platform default charset, as before;
            // confirm it matches the encoding the script writes.
            try (BufferedReader reader =
                         new BufferedReader(new InputStreamReader(process.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    collected.append(line);
                }
            }
            htmlStr = collected.toString();

            int exitCode = process.waitFor();
            if (exitCode != 0) {
                log.warn("phantomjs exited with code " + exitCode
                        + " for " + url + " page " + pageNum);
            }
        } catch (IOException e) {
            log.error("Failed to run phantomjs for " + url + " page " + pageNum, e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag for the caller; output already read is still returned.
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for phantomjs for " + url
                    + " page " + pageNum, e);
        } finally {
            if (process != null) {
                process.destroy();
            }
        }
        return htmlStr;
    }

    /**
     * Fetches the rendered page source and prints the content of every table cell
     * found in rows with an empty class attribute, one cell per line.
     *
     * @param urlPath page address to fetch
     * @param pageNum page number to fetch
     */
    public static void AccessHtml(String urlPath, String pageNum) {
        String html = getHtml(urlPath, pageNum);
        // The patterns use '.', which does not match line breaks, so flatten the text.
        html = html.replace("\n", "").replace("\t", "");
        printCells(html);
    }

    /** Prints the text of each {@code <td>} inside each matching {@code <tr>} of {@code html}. */
    private static void printCells(String html) {
        Matcher rows = ROW_PATTERN.matcher(html);
        while (rows.find()) {
            Matcher cells = CELL_PATTERN.matcher(rows.group(0));
            while (cells.find()) {
                Matcher text = CELL_TEXT_PATTERN.matcher(cells.group(0));
                while (text.find()) {
                    System.out.println(text.group(1));
                }
            }
        }
    }

}